3 Cluster analysis
3.1 Cluster prevalence
Number of strategies each cluster was captured in.
cluster_prevalence <- cluster_counts %>%
filter(read_count>0) %>%
group_by(secondary_cluster) %>%
summarise(n_strategy = n_distinct(overall_strategy)) %>%
arrange(desc(n_strategy))
#Arranged by phylum and cluster prevalence
cluster_prevalence_phylum <- cluster_prevalence %>%
left_join(cluster_taxonomy,by="secondary_cluster")3.2 Cluster frequency
Number of clusters each strategy recovered.
cluster_number <- cluster_counts %>%
filter(read_count>0) %>%
group_by(overall_strategy) %>%
summarise(n = n_distinct(secondary_cluster)) %>%
arrange(desc(n)) %>%
mutate(overall_strategy=factor(overall_strategy,levels=rev(overall_strategy)))
cluster_number %>%
rename(Strategy=overall_strategy) %>%
select(Strategy,n) %>%
tt()| Strategy | n |
|---|---|
| coassembly_timepoint_all | 176 |
| coassembly_animal | 163 |
| coassembly_timepoint_cage | 160 |
| multicoverage_animal | 130 |
| multicoverage_timepoint_all | 126 |
| single_coverage | 117 |
| multisplit_timepoint_all | 115 |
| multisplit_timepoint_cage | 110 |
| multisplit_animal | 99 |
| multicoverage_timepoint_cage | 93 |
cluster_number %>%
filter(overall_strategy != "cobinning_treatment") %>%
ggplot(aes(x = n, y = overall_strategy)) +
geom_col(color="#666666") +
coord_cartesian(xlim = c(90,200))+
theme_classic() +
theme() +
labs(x="Number of clusters", y="Strategy")3.3 Cluster heatmap
Overview of clusters per strategy.
cluster_counts %>%
left_join(cluster_taxonomy,by="secondary_cluster") %>%
mutate(Phylum=ifelse(is.na(Phylum),"Bacillota_A",Phylum)) %>%
mutate(secondary_cluster=factor(secondary_cluster,levels=rev(cluster_tree$tip.label))) %>%
mutate(overall_strategy=factor(overall_strategy,levels=rev(cluster_number$overall_strategy))) %>%
group_by(overall_strategy, secondary_cluster) %>%
sample_n(1) %>%
ungroup() %>%
ggplot(aes(x = secondary_cluster, y = overall_strategy, fill=Phylum)) +
geom_tile(color="#ffffff") +
scale_fill_manual(values=phylum_colors) +
theme_classic() +
theme(axis.text.x = element_blank(),
axis.title.y = element_blank()) +
xlab("MAG clusters")3.4 Cluster abundance
Maximum read count
maximum_read_count <- cluster_counts %>%
group_by(secondary_cluster) %>%
slice_max(order_by = read_count) %>%
ungroup()maximum_read_count %>%
mutate(secondary_cluster=factor(secondary_cluster,levels=rev(cluster_tree$tip.label))) %>%
ggplot(aes(x = secondary_cluster, y = read_count)) +
geom_col(color="#666666") +
theme_classic() +
theme(axis.text.x = element_blank()) +
labs(x="MAG clusters", y="Number of reads")3.5 MAG mapping
Average percentage of read mapped in each strategy against the respective catalogue.
mag_mapping <- bin_counts %>%
mutate(sample = str_replace_all(sample, "_subsam", ""),
sample = str_replace_all(sample, "_M", "")) %>%
left_join(all_counts, by = join_by(sample)) %>%
summarise(mag_mapping = sum(read_count) / mean(total_count), .by = c(sample, overall_strategy)) %>%
summarise(mag_mapping = mean(mag_mapping), .by = overall_strategy) %>%
mutate(overall_strategy=factor(overall_strategy,levels=rev(cluster_number$overall_strategy)))
mag_mapping %>%
filter(!is.na(overall_strategy)) %>%
rename(Strategy=overall_strategy,`Mapping %`=mag_mapping) %>%
mutate(`Mapping %`=`Mapping %`*100) %>%
tt()| Strategy | Mapping % |
|---|---|
| single_coverage | 80.25097 |
| multicoverage_animal | 79.78157 |
| multicoverage_timepoint_cage | 79.68070 |
| multicoverage_timepoint_all | 80.48701 |
| coassembly_animal | 80.91453 |
| coassembly_timepoint_all | 82.53138 |
| coassembly_timepoint_cage | 80.11122 |
| multisplit_animal | 68.33532 |
| multisplit_timepoint_all | 83.70626 |
| multisplit_timepoint_cage | 77.11125 |
mag_mapping %>%
filter(overall_strategy != "cobinning_treatment") %>%
ggplot(aes(x = mag_mapping, y = overall_strategy)) +
geom_col(color="#666666") +
coord_cartesian(xlim = c(0.6,0.9))+
theme_classic() +
theme() +
labs(x="Mapping percentage", y="Strategy")3.6 Domain-adjusted mapping rates (DAMR)
#import SingleM estimates
smf_files <- list.files(path = "data/singlem/",
pattern = "*tsv.gz",
full.names = T)
smf_estimates <- purrr::map(smf_files, read_delim) %>%
bind_rows()
mean_smf <- smf_estimates %>%
mutate(read_fraction = as.numeric(str_replace(read_fraction, "%", ""))) %>%
pull(read_fraction) %>%
mean()
mag_mapping %>%
filter(!is.na(overall_strategy)) %>%
rename(Strategy=overall_strategy) %>%
mutate(mag_mapping = mag_mapping * 100,
DAMR = (mag_mapping / mean_smf) * 100,
DAMR = scales::number(if_else(DAMR > 100, 100, DAMR),
accuracy = 0.1)
) %>%
select(-mag_mapping) %>%
tt()| Strategy | DAMR |
|---|---|
| single_coverage | 100.0 |
| multicoverage_animal | 100.0 |
| multicoverage_timepoint_cage | 100.0 |
| multicoverage_timepoint_all | 100.0 |
| coassembly_animal | 100.0 |
| coassembly_timepoint_all | 100.0 |
| coassembly_timepoint_cage | 100.0 |
| multisplit_animal | 87.9 |
| multisplit_timepoint_all | 100.0 |
| multisplit_timepoint_cage | 99.2 |
3.7 Read counts vs prevalence
maximum_read_count %>%
left_join(cluster_prevalence,by=join_by(secondary_cluster==secondary_cluster)) %>%
left_join(cluster_taxonomy,by="secondary_cluster") %>%
select(read_count, n_strategy, Phylum) %>%
ggplot(aes(x = read_count, y = n_strategy, group=n_strategy, color=Phylum)) +
geom_boxplot(color="#999999", fill="#f4f4f4", outlier.shape = NA) +
scale_y_continuous(breaks=seq(1,10,1))+
scale_color_manual(values=phylum_colors) +
geom_jitter() +
theme_classic() +
theme() +
labs(x="Sequencing depth", y="Strategy prevalence")